global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/3_map_ipc4_sic_using.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {


* v3; 3_map_ipc4_to_sic_using.do

/* This do-file matches the make-flow table (step 2) with the IPC4 - SIC4 concordance (step 1).
Because the current_sic and sic_ind_make are at different levels of aggregation, this code does the following:
- A. Cases where the current_sic and sic_ind_make already match uniquely
- B. Cases where the current_sic and the sic_ind_make of one level aggregated match uniquely
        - Just Nursing, nothing else
- C. Take average of weights (go to sic3)
- D. Take average of weights (where some codes are already averages) (go to sic2)
- E. Difficult cases
- F. Finalize mapping
*/

**************************************
* A. Unique matching cases
**************************************

use ${alm_data_proc}/cw_ipc4_current_sic.dta, clear
* drop the SIC codes which we cannot match at all (also not below) or which are not a proper SIC 87 code
drop if inlist(current_sic, 3576,4888,4955,4991,7385,9995,9997,6199,6797)

* reshuffle weights such that they sum up again to 1 per ipc4 code
ren w_ipc4_current_sic weight_ipc4_sic_mfg
egen totalw =total(weight_ipc4_sic_mfg), by(ipc4)
replace weight_ipc4_sic_mfg=weight_ipc4_sic_mfg/totalw
drop totalw

*make current sic into same format as sic4
tostring current_sic, gen(current_sic_string)
replace current_sic_string = "0" + current_sic_string if current_sic<1000
drop current_sic
rename current_sic_string sic_ind_make

*Match with make flow table
joinby  sic_ind_make using ${alm_data_proc}/sic4_make_flow.dta, unmatched(both) 
drop if _merge==2
tab sic_ind_make if _merge==3 
rename (_merge weight_make_flow sic_ind_flow) (_merge_A weight_make_flow_A sic_ind_flow_A)

**************************************
* B. Unique by aggregating for "Nursing"; we do not need a subcaterogy 
**************************************

gen sic_ind_make_B = ""

replace sic_ind_make_B = "8050" if inlist(sic_ind_make, "8051")

preserve

    use ${alm_data_proc}/sic4_make_flow.dta, clear
    rename sic_ind_make sic_ind_make_B
    save ${tmp_dir}/ipc4_sic4_of_mfg_concordance_B.dta, replace

restore

joinby  sic_ind_make_B using ${tmp_dir}/ipc4_sic4_of_mfg_concordance_B.dta, unmatched(both) 
drop if _merge==2

assert _merge==3 if sic_ind_make=="8051"
assert _merge!=3 if sic_ind_make!="8051"

rename (_merge weight_make_flow sic_ind_flow) (_merge_B weight_make_flow_B sic_ind_flow_B)

*Combine weight of A and B, sic_ind_flow and merge in one variab le 
gen weight_make_flow_AB = weight_make_flow_A
replace weight_make_flow_AB = weight_make_flow_B if _merge_B ==3

gen sic_ind_flow_AB = sic_ind_flow_A
replace sic_ind_flow_AB = sic_ind_flow_B if _merge_B ==3
gen _merge_AB = _merge_A
replace _merge_AB =_merge_B if _merge_B ==3

drop *_A *_B

*save a unique crosswalk with nursing aggregated
save ${tmp_dir}/ipc4_sic_ind_make_AB.dta, replace

**************************************
* C. Take average of weights i.e. go one level up, from sic4 to sic3
**************************************

use ${alm_data_proc}/sic4_make_flow.dta, clear

*Generate the right level of aggregation
gen sic_ind_make_3 =substr(sic_ind_make, 1, 3)
replace sic_ind_make_3 = sic_ind_make_3 +"0"
gen sic_ind_make_C =""

*get toplevel sic3 codes for all relevant fields (machinery)
foreach x in 1090 1220 1520 1540 2040 2060 2070 2320 2330 2340 2390 2430 2540 2590 2650 2750 2780 2790 2810 2860 2870 2950 2990 3050 3060 3140 3250 3260 3290 3320 3360 3390 3420 3460 3470 3480 3490 3510 3590 3640 3730 3760 3790 3950 3960 3990 4610 5020 5110 5130 5150 5160 5190 5990 6020 7340 7510 7830 7990 8060 {
    replace sic_ind_make_C =sic_ind_make_3 if sic_ind_make_3 == "`x'" 
}
drop if sic_ind_make_C == ""
keep weight_make_flow sic_ind_make_C sic_ind_flow sic_ind_make total_value_make

*Generate average weights
egen weight_make_flow_C = wtmean(weight_make_flow), weight(total_value_make) by(sic_ind_make_C sic_ind_flow)

* test for weights sum == 1
egen test1 = total(weight_make_flow), by(sic_ind_make)
egen test2 = mean(test1), by(sic_ind_make_C)

duplicates drop sic_ind_make_C sic_ind_flow, force

egen test3 = total(weight_make_flow_C), by(sic_ind_make_C)
gen test4 = test2 - test3

*we do have some very very small rounding errors
assert test4 < 0.00000006
drop test* sic_ind_make weight_make_flow total_value_make


rename (sic_ind_make_C sic_ind_flow) (sic_ind_make sic_ind_flow_C)

*save a crosswalk for amchinery only, including correct weights
save ${tmp_dir}/ipc4_sic_ind_make_C.dta, replace

*Match with overall data
use ${tmp_dir}/ipc4_sic_ind_make_AB.dta, clear
joinby sic_ind_make using ${tmp_dir}/ipc4_sic_ind_make_C.dta, unmatched(both)
assert _merge!=2
rename _merge _merge_C

*consolidate to one variable
gen weight_make_flow_ABC = weight_make_flow_AB
replace weight_make_flow_ABC =weight_make_flow_C if _merge_C ==3

gen sic_ind_flow_ABC = sic_ind_flow_AB
replace sic_ind_flow_ABC = sic_ind_flow_C if _merge_C ==3
gen _merge_ABC = _merge_AB
replace _merge_ABC =_merge_C if _merge_C ==3 
drop *_AB *_C

*save the crosswalk with aggregated machinery codes and their aggregated weights
save ${tmp_dir}/ipc4_sic_ind_make_ABC.dta, replace

****************************************
* D. Take average of average of weights i.e. go two levels up, from sic4 to sic2
****************************************

use ${alm_data_proc}/sic4_make_flow.dta, clear

*assign right level of aggregation
gen sic_ind_make_2 = substr(sic_ind_make, 1, 2)
replace sic_ind_make_2 = sic_ind_make_2 +"00"

*Keep the ones which need average of average
gen keep_sics =. 
replace keep_sics=1 if inlist(sic_ind_make_2,"0700","0800","1400")
replace keep_sics=1 if inlist(sic_ind_make_2,"4100","5500","7200","7600","8200")
keep if keep_sics==1
drop keep_sics

gen sic_ind_make_D =""
foreach x in  0700 0800 1400 4100 5500 7200 7600 8200{
replace sic_ind_make_D = sic_ind_make_2 if sic_ind_make_2 == "`x'" 
}
gen sic_ind_make_3 = substr(sic_ind_make, 1, 3)
replace sic_ind_make_3 = sic_ind_make_3 +"0"

*first average
egen weight_make_flow_D_intm =wtmean(weight_make), weight(total_value_make) by(sic_ind_flow sic_ind_make_3)
duplicates drop sic_ind_flow sic_ind_make_3, force
drop weight_make_flow sic_ind_make sic_ind_make_2

*second average
egen weight_make_flow_D =wtmean(weight_make_flow_D_intm),weight(total_value_make) by(sic_ind_flow sic_ind_make_D)
keep sic_ind_flow sic_ind_make_D weight_make_flow_D
duplicates drop sic_ind_flow sic_ind_make_D, force

rename sic_ind_flow sic_ind_flow_D
rename sic_ind_make_D sic_ind_make

*again make sure we only have small rounding errors
egen test=total(weight_make_flow_D), by(sic_ind_make)
assert test>0.9999 & test<1.000001

save ${tmp_dir}/ipc4_sic_ind_make_D, replace

* match with data
use ${tmp_dir}/ipc4_sic_ind_make_ABC.dta, clear

joinby sic_ind_make using ${tmp_dir}/ipc4_sic_ind_make_D.dta, unmatched(both)

rename _merge _merge_D
assert _merge_D!=2

*assign to the one variable, save
gen weight_make_flow_ABCD = weight_make_flow_ABC
replace weight_make_flow_ABCD =weight_make_flow_D if _merge_D ==3
gen sic_ind_flow_ABCD = sic_ind_flow_ABC
replace sic_ind_flow_ABCD = sic_ind_flow_D if _merge_D ==3
gen _merge_ABCD = _merge_ABC
replace _merge_ABCD =_merge_D if _merge_D ==3 
drop *_ABC

save ${tmp_dir}/ipc4_sic_ind_make_ABCD.dta, replace

****************************************
* E. Difficult Cases
****************************************

*In these cases we have to generate a weighted weight_make_flow which depends on how
*many machinery patents are assigned per IPC and current_sic code and which SIC subcodes
*occur per IPC and curren_sic codes.

*example: 1000 (not true in the data - fictional example) 
*If 1000 occurs alone for a specific IPC code - weight is average of subcodes 
*(so 1010, 1020, 1030, 1040, 1060, 1080, 1090)
*If 1000 occurs with 1040 and 1090 for a specific IPC code - weight is weighted
*average of w1040 and w1090 - weighted by how many machinery patents are assigned from 
*this IPC code to the 1040 and 1090 industry 
*etc.

*There are two types of difficult cases
*I) normal one
*II) 2 level ones - when for example 2000 - has to be computed from the weighted 
*weight of 2011 2013, 2015, 2020, 2080 etc. but 2080 has to be created in the same 
*way from 2082, 2086 (fictional example) 


* E.I) The "normal" ones


*Only keep those special cases with their subcodes
gen keep=0
replace keep=1 if inlist(sic_ind_make,"1040","1044","1600","1623","1700","1731","2020","2024","2030")
replace keep=1 if inlist(sic_ind_make,"2033","2050","2052","2080","2082","2084","2085","2086","2090")
replace keep=1 if inlist(sic_ind_make,"2092","2100","2111","2250","2253","2300","2320","2330","2340")
replace keep=1 if inlist(sic_ind_make,"2390","2400","2421","2430","2452","2510","2511","2520","2522")
replace keep=1 if inlist(sic_ind_make,"2670")
replace keep=1 if inlist(sic_ind_make,"2673","2820","2821","2840","2842","2844","2890","2891","3080")
replace keep=1 if inlist(sic_ind_make,"3081","3086","3089","3100","3140","3220","3221","3270","3272")
replace keep=1 if inlist(sic_ind_make,"3310","3312","3317","3330","3334","3350","3357","3430","3433")
replace keep=1 if inlist(sic_ind_make,"3440","3442","3443","3444","3448","3530","3531","3532","3533")
replace keep=1 if inlist(sic_ind_make,"3537","3540","3541","3550","3555","3559","3560","3561","3562")
replace keep=1 if inlist(sic_ind_make,"3564","3567","3569","3570","3571","3572","3575","3576","3577")
replace keep=1 if inlist(sic_ind_make,"3578","3579","3580","3585","3620","3621","3630","3634","3670")
replace keep=1 if inlist(sic_ind_make,"3672","3674","3677","3678","3679","3690","3695","3910","3911")
replace keep=1 if inlist(sic_ind_make,"4210","4213","4400","4412","4700","4731","4950","4953","4955")
replace keep=1 if inlist(sic_ind_make,"5010","5013","5030","5031","5040","5045","5047","5070","5072")
replace keep=1 if inlist(sic_ind_make,"5080","5082","5084","5090","5093","5094","5099","5140","5141")
replace keep=1 if inlist(sic_ind_make,"5200","5211","5700","5712","5731","5735","5940","5944","5960")
replace keep=1 if inlist(sic_ind_make,"5961","6200","6211","6282","6510","6512","6513","7310","7311")
replace keep=1 if inlist(sic_ind_make,"7320","7323","7330","7331","7370","7371","7372","7373","7374")
replace keep=1 if inlist(sic_ind_make,"7377","7380","7381","7389","7500")
replace keep=1 if inlist(sic_ind_make,"7510","8090","8093","8700","8711","8721","8731","8734","8742")
keep if keep==1
drop keep 
sort ipc4 sic_ind_make

*Assign right level of aggregation
gen sic_ind_make_2=substr(sic_ind_make, 1, 2)
replace sic_ind_make_2= sic_ind_make_2 + "00"
gen sic_ind_make_3=substr(sic_ind_make, 1, 3)
replace sic_ind_make_3= sic_ind_make_3 + "0"

gen agg_level=""
foreach x in 1600 1700 2100 2300 2400 3100 4400 4700 5200 5700 6200 7500 8700{
    replace agg_level=sic_ind_make_2 if sic_ind_make_2=="`x'"
}
foreach x in  1040 2020 2030 2050 2080 2090 2250 2510 2520 2670 2820 2840 2890 3080 3220 3270 3310 3330 3350 3430 3440 3530 3540 3550 3560 3570 3580 3620 3630 3670 3690 3910 4210 4950 5010 5030 5040 5070 5080 5090 5140 5940 5960 6510 7310 7320 7330 7370 7380 8090 {
    replace agg_level=sic_ind_make_3 if sic_ind_make_3=="`x'"
}
duplicates tag agg_level ipc4, gen(dup)
replace dup = dup+1
drop sic_ind_make_2 sic_ind_make_3

save ${tmp_dir}/ipc4_sic_ind_make_ABCDE_pre.dta, replace

*If dup==1 then we cannot build a weighted weight (because the code occurs alone)
*and thus we simply assign the average of the weights at one aggregation level
*less - create those weights in a separate dataset and match them 

use ${alm_data_proc}/sic4_make_flow.dta, clear

*once again, set aggregation levels
gen sic_ind_make_3 = substr(sic_ind_make, 1,3)
replace sic_ind_make_3 = sic_ind_make_3 + "0"
gen sic_ind_make_2 = substr(sic_ind_make, 1,2)
replace sic_ind_make_2 = sic_ind_make_2 + "00"
gen agg_level = ""

foreach x in 1600 1700 2100 2300 2400 3100 4400 4700 5200 5700 6200 7500 8700{
    replace agg_level=sic_ind_make_2 if sic_ind_make_2=="`x'"
}
foreach x in 1040 2020 2030 2050 2080 2090 2250 2510 2520 2670 2820 2840 2890 3080 3220 3270 3310 3330 3350 3430 3440 3530 3540 3550 3560 3570 3580 3620 3630 3670 3690 3910 4210 4950 5010 5030 5040 5070 5080 5090 5140 5940 5960 6510 7310 7320 7330 7370 7380 8090 {
    replace agg_level=sic_ind_make_3 if sic_ind_make_3=="`x'"
}
keep if agg_level != ""

* First average
egen weight_make_flow_avg1 = wtmean(weight_make_flow) , weight(total_value_make) by(sic_ind_flow sic_ind_make_3)
duplicates drop sic_ind_flow sic_ind_make_3 , force
replace weight_make_flow = weight_make_flow_avg1
drop weight_make_flow_avg1

* Second average
egen weight_make_flow_avg2 = wtmean(weight_make_flow), weight(total_value_make) by(agg_level sic_ind_flow)
duplicates drop sic_ind_flow agg_level, force
keep weight_make_flow_avg2 agg_level sic_ind_flow
rename weight_make_flow_avg2 weight_make_flow_E

* Match with the ongoing data
gen dup=1
save ${tmp_dir}/ipc4_sic_ind_make_E.dta, replace

use ${tmp_dir}/ipc4_sic_ind_make_ABCDE_pre.dta, replace

joinby agg_level dup using ${tmp_dir}/ipc4_sic_ind_make_E.dta , unmatched(both)

drop if _merge==2
assert _merge==3 if dup==1
assert _merge==1 if dup!=1
rename _merge _merge_E
rename sic_ind_flow sic_ind_flow_E
sort ipc4 agg_level sic_ind_make

*Drop the cases where less agg SIC code occurs but the one we would have to aggregate not.
*so in ipc4 A01B 2821 occurs but 2820 not - so we can drop 2821 (fictional example)
gen keep = 1 if agg_level == sic_ind_make
egen sum_keep=total(keep), by(agg_level ipc4)
assert sum_keep == 933 if dup == 1
drop if sum_keep == 0
drop sum_keep

*Generate weighted weight of w_2 for agg_level where dup!=1
egen total1 = sum(weight_ipc4_sic_mfg) if dup!=1 & keep!=1, by(ipc4 agg_level sic_ind_flow_ABCD)
gen temp1 = weight_ipc4_sic_mfg/total1 if dup!=1 & keep!=1
gen temp2 = temp1 * weight_make_flow_ABCD if dup!=1 & keep!=1
egen weight_make_flow_E2 =total(temp2), by(ipc4 agg_level sic_ind_flow_ABCD) 
replace weight_make_flow_E2 =. if dup==1 & keep==1
replace weight_make_flow_E2 =. if keep==1 
duplicates drop agg_level sic_ind_flow_ABCD ipc4 if dup!=1 & keep!=1 , force
drop total1 temp1 temp2
drop if keep==1 & dup!=1 & weight_make_flow_E2 ==. 

*check if we have indeed missing weights.
* we cant have missing weights if we have already aggregated on a level below
*vice versa we should not have have weights and then have averaged out
assert weight_make_flow_E ==. if weight_make_flow_D !=. 
assert weight_make_flow_E2 ==. if weight_make_flow_E!=. 

*unify the two approaches
gen weight_make_flow_E3 = weight_make_flow_E2
replace weight_make_flow_E3 =weight_make_flow_E if keep==1 & dup==1
assert weight_make_flow_E3!=.

*now we should not have any missing anymore for the cases where we have to aggregate from the same level
gen sic_ind_flow_E3 = sic_ind_flow_ABCD
replace sic_ind_flow_E3 = sic_ind_flow_E if keep==1 & dup==1
assert sic_ind_flow_E3!=""

keep ipc4 agg_level weight_make_flow_E3 sic_ind_flow_E3
rename agg_level sic_ind_make

save ${tmp_dir}/ipc4_sic_ind_make_E3.dta, replace

*match with the data up to D
use ${tmp_dir}/ipc4_sic_ind_make_ABCD.dta, clear
joinby ipc4 sic_ind_make using ${tmp_dir}/ipc4_sic_ind_make_E3.dta, unmatched(both)
rename _merge _merge_E3
assert _merge_ABCD ==1 if _merge_E3 == 3

* update the variables
gen weight_make_flow_ABCDE = weight_make_flow_ABCD
replace weight_make_flow_ABCDE = weight_make_flow_E3 if _merge_E3==3

gen sic_ind_flow_ABCDE = sic_ind_flow_ABCD
replace sic_ind_flow_ABCDE =sic_ind_flow_E if _merge_E3==3

gen _merge_ABCDE = _merge_ABCD
replace _merge_ABCDE =_merge_E3 if _merge_E3==3

save ${tmp_dir}/ipc4_sic_ind_make_ABCDE.dta, replace


* E.II) Level 2 cases
* ------------------

*Only keep those special cases with their subcodes
gen keep=0
replace keep=1 if inlist(sic_ind_make,"1000","1040","1044","1090","2000","2011","2013","2015")
replace keep=1 if inlist(sic_ind_make,"2020","2024","2030","2033","2040","2050","2052","2060")
replace keep=1 if inlist(sic_ind_make,"2070","2080","2082","2084","2085","2086","2090","2092")
replace keep=1 if inlist(sic_ind_make,"2200","2211","2221","2250","2253","2273","2600","2611")
replace keep=1 if inlist(sic_ind_make,"2621","2631","2650","2670","2673","2800","2810","2820")
replace keep=1 if inlist(sic_ind_make,"2821","2833","2834","2835","2836","2840","2842","2844")
replace keep=1 if inlist(sic_ind_make,"2851","2860","2870","2890","2891","3600","3612","3613")
replace keep=1 if inlist(sic_ind_make,"3620","3621","3630","3634","3640","3651","3652","3661")
replace keep=1 if inlist(sic_ind_make,"3663","3669","3670","3672","3674","3677","3678","3679")
replace keep=1 if inlist(sic_ind_make,"3690","3695","5000","5010","5013","5020","5030","5031")
replace keep=1 if inlist(sic_ind_make,"5040","5045","5047","5051","5063","5064","5065","5070")
replace keep=1 if inlist(sic_ind_make,"5072","5080","5082","5084","5090","5093","5094","5099")
replace keep=1 if inlist(sic_ind_make,"5900","5912","5940","5944","5960","5961","5990","6500")
replace keep=1 if inlist(sic_ind_make,"6510","6512","6513","6531","6552")
keep if keep==1
drop keep 
sort ipc4 sic_ind_make

*Assign right level of aggregation
gen agg_level=substr(sic_ind_make, 1, 2)
replace agg_level= agg_level + "00"
duplicates tag agg_level ipc4, gen(dup)
replace dup=dup+1

*If dup==1 then we cannot build a weighted weight (because the code occurs alone)
*and thus we simply assign the average of the weights at one aggregation level
*less - create those weights in a separate dataset and match them 

save ${tmp_dir}/ipc4_sic_ind_make_ABCDE2_pre.dta, replace

use ${alm_data_proc}/sic4_make_flow.dta, clear
gen agg_level=substr(sic_ind_make, 1,2)
replace agg_level=agg_level + "00"
gen keep=0
foreach x in 1000 2000 2200 2600 2800 3600 5000 5900 6500{
    replace keep=1 if agg_level=="`x'"
}
drop if keep==0
drop keep
gen agg_level_1=substr(sic_ind_make, 1,3)
replace agg_level_1=agg_level_1 + "0"

*First average
egen weight_make_flow_intm = wtmean(weight_make_flow), weight(total_value_make) by(sic_ind_flow agg_level_1)
duplicates drop sic_ind_flow agg_level_1 , force
replace weight_make_flow = weight_make_flow_intm 
drop weight_make_flow_intm sic_ind_make

*Second average
egen weight_make_flow_intm_2 = wtmean(weight_make_flow), weight(total_value_make) by(agg_level sic_ind_flow)
duplicates drop sic_ind_flow agg_level, force
keep weight_make_flow_intm_2 agg_level sic_ind_flow
rename weight_make_flow_intm_2 weight_make_flow_E4

gen dup=1
save ${tmp_dir}/ipc4_sic_ind_make_E.dta, replace

*merge in the template for aggregation
use ${tmp_dir}/ipc4_sic_ind_make_ABCDE2_pre, clear
joinby agg_level dup using ${tmp_dir}/ipc4_sic_ind_make_E.dta, unmatched(both)
drop if _merge==2
assert _merge==3 if dup==1
assert _merge==1 if dup!=1
rename _merge _merge_E2
rename sic_ind_flow sic_ind_flow_E4

sort ipc4 agg_level sic_ind_make

*Drop the cases where less agg SIC code occurs but one we would have to aggregate not
*so in ipc4 A01B 2821 occurs but 2820 not - so we can drop 2821 (fictional example)
gen keep=1 if agg_level==sic_ind_make
egen sum_keep=total(keep), by(agg_level ipc4)
assert sum_keep==933 if dup==1
drop if sum_keep==0
drop sum_keep

*Generate weighted weight for agg_level where dup!=1
*use weight from one level below only (so if 2250 and 2253 occurs in 2200 only
*use weight from 2250 but the machinery patent counts (the w_1_II) from 2250 and 2253 aggregated)
*like this it is similar to the average of average but with weight (is often the
*same as when we use weights and patent counts from each level (so from 2250 and
*2253) because 2250 often same weight as 2251). 

*First aggregate some patent counts by 3-digit level
gen agg_level_1=substr(sic_ind_make, 1,3)
replace agg_level_1=agg_level_1 + "0"

*Assign patent counts to higher level (3 digit code) if existent 
gen tag= 1 if sic_ind_make==agg_level_1 & keep!=1
egen sum_tag = total(tag), by(agg_level_1 ipc4)
egen agg = total(weight_ipc4_sic_mfg) if sum_tag==933, by(agg_level_1 ipc4 sic_ind_flow_ABCDE)

*Drop the lower level 
drop if tag==. & sum_tag==933
replace weight_ipc4_sic_mfg = agg if tag==1
drop agg

egen t1=total(weight_ipc4_sic_mfg) if keep!=1, by(agg_level ipc sic_ind_flow_ABCDE)
gen t2=weight_ipc4_sic_mfg/t1 if keep!=1
gen t3=t2*weight_make_flow_ABCDE if keep!=1 
egen weight_make_flow_E5 = total(t3), by(agg_level ipc4 sic_ind_flow_ABCDE)

replace weight_make_flow_E5 =. if dup==1 & keep==1
replace weight_make_flow_E5 =. if keep==1 
duplicates drop agg_level sic_ind_flow_ABCDE ipc4 if dup!=1 & keep!=1 , force
drop t1 t2 t3
drop if keep==1 & dup!=1 & weight_make_flow_E5==.

*again, check we have not mistakenly aggregated codes that don't need it
assert weight_make_flow_E5==. if weight_make_flow_E4!=.  
assert weight_make_flow_E4==. if weight_make_flow_E5!=. 

gen weight_make_flow_E6 = weight_make_flow_E5
replace weight_make_flow_E6 = weight_make_flow_E4 if keep==1 & dup==1
assert weight_make_flow_E6!=.

gen sic_ind_flow_E6 = sic_ind_flow_ABCDE
replace sic_ind_flow_E6 = sic_ind_flow_E4 if keep==1 & dup==1
assert sic_ind_flow_E6 !=""

keep ipc4 agg_level weight_make_flow_E6 sic_ind_flow_E6
rename agg_level sic_ind_make

save ${tmp_dir}/ipc4_sic_ind_make_E2.dta, replace

*match with the ongoing (large) datafile, update variables

use ${tmp_dir}/ipc4_sic_ind_make_ABCDE.dta, clear
joinby ipc4 sic_ind_make using ${tmp_dir}/ipc4_sic_ind_make_E2.dta, unmatched(both)
rename _merge _merge_E6
assert _merge_ABCDE == 1 if _merge_E6 ==3

gen weight_make_flow_ABCDE2 = weight_make_flow_ABCDE
replace weight_make_flow_ABCDE2 = weight_make_flow_E6 if _merge_E6 ==3

gen sic_ind_flow_ABCDE2 = sic_ind_flow_ABCDE
replace sic_ind_flow_ABCDE2 = sic_ind_flow_E6 if _merge_E6 ==3

gen _merge_ABCDE2 = _merge_ABCDE
replace _merge_ABCDE2 = _merge_E6 if _merge_E6 == 3
drop *_ABCD *_E* 



****************************************
* F. Finalize mapping
****************************************

ren (sic_ind_flow_ABCDE2 weight_make_flow_ABCDE2) (sic_ind_flow_ed weight_make_flow_ed)
keep sic_ind_make ipc4 weight_ipc4_sic_mfg sic_ind_flow_ed weight_make_flow_ed

*Drop the sic_ind_flow_final==. --> which indicates together with the 
*w_2_final how much of the specific sic_ind_make maps into a dead end because
*there is no correpsonding commodity in the flow table. Kept until here to 
*have the correct w_2_final in the steps above (when taking average etc.)
*But now we drop these observations and reshuffle later
drop if sic_ind_flow_ed =="."

*multiply the weights, sum up by technology and industry of use
gen weight_multiplied = weight_ipc4_sic_mfg * weight_make_flow_ed
egen weight_ipc4_sic_using = total(weight_multiplied), by(ipc4 sic_ind_flow_ed)
duplicates drop ipc4 sic_ind_flow_ed, force

* test whether weights are <1
egen total=total(weight_ipc4_sic_using), by(ipc4)
cap assert total < 1
drop total
keep ipc4 sic_ind_flow_ed weight_ipc4_sic_using

*Reshuffle such that in the end each IPC4 has a sum of weights of 1 into using
*industries
egen tot_byipc4 =total(weight_ipc4_sic_using), by(ipc4)
replace weight_ipc4_sic_using=weight_ipc4_sic_using/tot_byipc4
replace weight_ipc4_sic_using = 0 if tot_byipc4 == 0

* TODO when using the comm restrictions, 1.8k of 611k ipc4-sic comb have weight_ipc4_sic_using of 0, so I do this
drop tot_byipc4

* test weight_ipc4_sic_using == 1
egen total2 = total(weight_ipc4_sic_using), by(ipc4)
cap assert total2 > 0.99999 & total2 < 1.000001 
drop total2

sort ipc4 sic_ind_flow
rename sic_ind_flow_ed sic_ind_flow

*save the finished concordance table beteween IPC4 and SIC industry of use (=flow)
save ${alm_data_proc}/ipc4_sic4_ind_using_concordance.dta, replace

ren sic_ind_flow sic4_use
ren weight_ipc4_sic_using w_ipc4_sic4_use
save ${alm_data_proc}/cw_ipc4_sic4_use.dta, replace

}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat